# Import project py file
from life_analysis import *
# Import general libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from textwrap import wrap
import warnings
warnings.filterwarnings('ignore')
# Import modelling libraries
import scipy.stats as stats
from itertools import combinations
from sklearn.preprocessing import StandardScaler, scale, Normalizer, PolynomialFeatures, MinMaxScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, LassoLarsCV, LassoLarsIC
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Load the 2019 county health rankings data; rows 1-2 are secondary headers.
raw = pd.read_csv('analytic_data2019.csv', skiprows=[1,2])
display(raw.shape)
display(raw.head())

# Substrings identifying columns to discard (derived/duplicated measures).
filter_out = ['numerator', 'denominator', 'CI low', 'CI high', '%', 'death',
              'percentage', 'Percentage', 'ratio', 'Ratio', 'mortality']
# Character substitutions that turn raw column names into clean identifiers.
replace_dict = {' raw value':'', ' - ':'_', '-':'_', '=':'', '/':'_',
                '(':'', ')':'', '.':'', '+':' above', ' ':'_'}
# Columns judged irrelevant to the analysis, dropped by name.
drop_list = ['Uninsured_adults', 'Uninsured_children', 'Population']

# data_cleansing() is defined in life_analysis.py.
df = data_cleansing(raw, filter_out, replace_dict, drop_list).reset_index(drop=True)
# Keep an untouched copy (with FIPS codes etc.) for the choropleth map below.
df0 = df.copy()
# Columns 0-6 are non-numerical identifiers; modelling uses the remainder.
df = df.drop(df.iloc[:, 0:7], axis=1)
cols = list(df.columns)
df.shape
# Outlier removal: drop values lying more than k standard deviations from
# the mean (remove_outliers() comes from life_analysis.py).
k = 3.5
df = remove_outliers(df, k)

# Log-transform skewed predictors via log_transform() from life_analysis.py,
# leaving the target column itself untouched.
target = 'Life_expectancy'
df = log_transform(df, target)

# Visual sanity check using plot_all() from life_analysis.py:
#   1. scatter of each predictor vs. the target
#   2. histogram showing each predictor's spread
num_of_plots_per_row = 6
plot_all(df, num_of_plots_per_row, target)
# Choropleth of life expectancy by US county (plotly figure factory).
import plotly.figure_factory as ff

values = df0['Life_expectancy'].tolist()
fips = df0['5_digit_FIPS_Code'].tolist()
# Red (low life expectancy) through green (high).
colorscale = ['#ff0000', '#fe8000', '#e5aa00', '#b7b600', '#75a700', '#008000']
fig = ff.create_choropleth(fips=fips,
                           values=values,
                           scope=['USA'],
                           binning_endpoints=[70,75,77,79,84],
                           colorscale=colorscale,
                           county_outline={'color': 'rgb(255,255,255)', 'width': 0},
                           round_legend_values=True,
                           legend_title='Life Expectancy by US County')
fig.layout.template = None
fig.show()
# Split parameters
seed = 1
# Hold out a fixed 1000 rows for testing, expressed as a fraction of the
# cleansed data (NOTE: not an 80/20 split as an earlier comment suggested).
test_size = 1000/df.shape[0]
# Split target and predictors
y_all = df['Life_expectancy']
x_all = df.drop('Life_expectancy', axis=1)
# Train/test split with ~1000 rows held out (see test_size above)
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=test_size, random_state=seed)
# Fit the scaler on the training data only, to avoid test-set leakage
scaler = StandardScaler()
scaler.fit(x_train)
# Assign and transform training data
x_train = pd.DataFrame(scaler.transform(x_train), columns=x_all.columns) # transform training data
y_train = (pd.DataFrame(y_train)).reset_index(drop=True)
dftrain = pd.concat([y_train, x_train], axis=1)
# Assign and transform test data (using the train-fitted scaler)
x_test = pd.DataFrame(scaler.transform(x_test), columns=x_all.columns) # transform test data
y_test = (pd.DataFrame(y_test)).reset_index(drop=True)
# Baseline: cross-validated linear regression on all predictors.
regression = LinearRegression()
crossvalidation = KFold(n_splits=5, shuffle=True, random_state=seed)
baseline_rsq = np.mean(cross_val_score(regression, x_train, y_train, scoring='r2', cv=crossvalidation))
# FIX: use the same shuffled KFold splitter for MSE as for r-sq. The
# original passed cv=5 (an unshuffled split), so the two "kfold cv"
# metrics were computed on different folds.
baseline_mse = -np.mean(cross_val_score(regression, x_train, y_train, cv=crossvalidation, scoring='neg_mean_squared_error'))
print(f'No of predictors: {x_train.shape[1]}')
print(f'kfold cv r-sq: {round(baseline_rsq,4)}')
print(f'kfold cv MSE : {round(baseline_mse,4)}')

# Lasso regularization with alpha chosen by AIC via LassoLarsIC.
model_aic = LassoLarsIC(criterion='aic')
model_aic.fit(x_train, y_train)
lasso = Lasso(alpha = model_aic.alpha_)
lasso.fit(x_train, y_train)
# Training-set fit quality after regularization.
lasso_rsq = lasso.score(x_train, y_train)
lasso_mse = mean_squared_error(y_train, lasso.predict(x_train))
print(f'No of predictors : {x_train.shape[1]}')
print(f'Optimum AIC alpha : {round(model_aic.alpha_,4)}')
print(f'After Lasso regularization r-sq: {round(lasso_rsq,4)}')
print(f'After Lasso regularization MSE : {round(lasso_mse,4)}')
# Check p-value using StatsModel (not available in Scikit-learn)
outcome = 'Life_expectancy'
x_cols = list(x_train.columns)
# Build a patsy formula 'Life_expectancy~pred1+pred2+...' over all predictors.
predictors = '+'.join(x_cols)
formula = outcome + '~' + predictors
model = ols(formula=formula, data=dftrain).fit()
summary = model.summary()
# tables[1] is the coefficient table; parse its raw cells into a DataFrame.
pt = summary.tables[1]
pt = pd.DataFrame(pt.data)
# Row 0 holds the column headers; rows 0-1 (header + intercept) are dropped.
pt.columns = pt.iloc[0]
pt = pt.drop(pt.index[:2])
# Keep predictor name (the unnamed first column), p-value and coefficient.
pt = pt[['','P>|t|','coef']]
pt = pt.rename(columns={'':'Predictor'})
pt = pt.rename(columns={'P>|t|':'Pvalue'})
# Cells come out as strings; convert to numerics before sorting/filtering.
pt['Pvalue'] = pt['Pvalue'].astype(float)
pt['coef'] = pt['coef'].astype(float)
# Record the coefficient's sign separately, then rank by magnitude.
pt['sign'] = pt['coef'].map(lambda row: '+' if row > 0 else '-')
pt['coef'] = pt['coef'].map(lambda row: abs(row))
# Ascending p-value, then descending |coef|; keep significant rows only.
pt = pt.sort_values(['Pvalue', 'coef'], ascending=[1, 0])
pt = pt[pt['Pvalue'] < 0.05]
print(f'List of {len(pt)} predictors with p-value < 0.05')
display(pt)
list_predictors = list(pt['Predictor'])
# Pearson correlation of every column in dftrain against life expectancy.
cols = list(dftrain.columns)
corr_list = []
for col in cols:
    r, _ = stats.pearsonr(dftrain[col], dftrain.Life_expectancy)
    corr_list.append([abs(round(r, 3)), col])
# Strongest correlations first; index 0 is Life_expectancy against itself
# (r = 1.0), so the displayed window starts at index 1.
corr_list = sorted(corr_list, key=lambda i: i[0], reverse=True)
k = 20
print(f'Top {k} Correlation between all predictors vs. Life expectancy:')
display(corr_list[1:k+1])
# Shortlist the 20 predictors with the smallest p-values.
top_n = 20
top_predictors = list_predictors[:top_n]
print(f'Top {top_n} strongest linear correlation with Life Expectancy:')
display(top_predictors)

# Pairwise correlations among the shortlisted predictors, to detect
# multicollinearity before modelling.
corr_list1 = []
for left, right in combinations(top_predictors, 2):
    r, _ = stats.pearsonr(dftrain[left], dftrain[right])
    corr_list1.append([abs(round(r, 3)), left, right])
corr_list1.sort(key=lambda i: i[0], reverse=True)
print('Correlation between predictors to avoid multicollinearity (0.8 cut-off):')
display([item for item in corr_list1 if item[0] > 0.8])

# Manually drop one predictor from each highly-correlated pair.
drop_list = ['Frequent_physical_distress', 'Poor_physical_health_days', 'Median_household_income_White']
top_predictors = [i for i in top_predictors if i not in drop_list]
len(top_predictors)

# Heatmap of the remaining predictors' correlations as a visual check.
sns.heatmap(abs(df[top_predictors].corr()), cmap="PiYG", center=0)
plt.show()
# Cross validation using only the shortlisted predictors.
x_train = x_train[top_predictors]
regression = LinearRegression()
crossvalidation = KFold(n_splits=5, shuffle=True, random_state=seed)
model1_rsq = np.mean(cross_val_score(regression, x_train, y_train, scoring='r2', cv=crossvalidation))
# FIX: score MSE on the same shuffled folds as r-sq. The original passed
# cv=5, an unshuffled split, so the two printed metrics used different folds.
model1_mse = -np.mean(cross_val_score(regression, x_train, y_train, cv=crossvalidation, scoring='neg_mean_squared_error'))
print(f'No of predictors: {x_train.shape[1]}')
print(f'kfold cv r-sq: {round(model1_rsq,4)}')
print(f'kfold cv MSE : {round(model1_mse,4)}')
# Search pairwise interaction terms: add each product column to the scaled
# predictor matrix and keep the pair if it beats the model-1 baseline r2.
regression = LinearRegression()
crossvalidation = KFold(n_splits=5, shuffle=True, random_state=seed)
combo2 = list(combinations(x_train.columns, 2))
interactions = []
data = pd.DataFrame(scale(x_train), columns=x_train.columns)
for comb in combo2:
    # 'interaction' is overwritten every pass, so data always holds the base
    # predictors plus exactly one candidate product column.
    data['interaction'] = data[comb[0]] * data[comb[1]]
    score = np.mean(cross_val_score(regression, data, y_train, scoring='r2', cv=crossvalidation))
    if score > model1_rsq: interactions.append((round(score, 3), comb[0], comb[1]))
interactions = sorted(interactions, key=lambda i: i[0], reverse=True)
k = 20
# FIX: guard against max()/min() raising ValueError when no interaction
# improved on the baseline. After the descending sort, the extreme scores
# are simply the first and last entries.
if interactions:
    print(f'Max value of r2: {interactions[0][0]}')
    print(f'Min value of r2: {interactions[-1][0]}')
    print(f'Top {k} Interaction cross validation r2 between predictors:')
    display(interactions[:k])
else:
    print('No interaction term improved on the baseline r2.')
# Explore polynomial terms of degree 2 and 3 for each shortlisted predictor:
# replace the column with its polynomial expansion and cross-validate.
degrees = [2, 3]
polynomials = []
for col in x_train.columns:
    for degree in degrees:
        poly = PolynomialFeatures(degree, include_bias=False)
        expanded = poly.fit_transform(x_train[[col]])
        data = pd.concat([x_train.copy().drop(col, axis=1), pd.DataFrame(expanded)], axis=1)
        score = np.mean(cross_val_score(regression, data, y_train, scoring='r2', cv=crossvalidation))
        polynomials.append((round(score, 3), col, degree))
# Best cross-validated r2 first.
polynomials.sort(key=lambda i: i[0], reverse=True)
k=20
print(f'Top {k} r2 of polynomial terms for top predictors for degree={degrees}')
display(polynomials[:20])
# Feature choices from the exploration above.
# Polynomial terms: columns whose expansion gave the highest cv r2.
poly_columns = ['Teen_births', 'Adult_smoking', 'Physical_inactivity']
# Interaction terms: pairs of columns with the highest cv r2.
inter_columns = [['Physical_inactivity', 'Adult_smoking']]

# Build the polynomial features, keeping only the power-2 and power-3 terms:
# the degree-1 column is dropped because x_train already contains it.
dfpoly = pd.DataFrame()
for col in poly_columns:
    expanded = PolynomialFeatures(3, include_bias=False).fit_transform(x_train[[col]])
    names = [col, f'{col}_power2', f'{col}_power3']
    frame = pd.DataFrame(expanded, columns=names)
    dfpoly = pd.concat([dfpoly, frame], axis=1).drop([col], axis=1)

# Build the interaction features as element-wise products.
dfinter = pd.DataFrame()
for left, right in inter_columns:
    dfinter[f'{left}_AND_{right}'] = x_train[left] * x_train[right]

# Merge target, base predictors, polynomial and interaction terms.
df_shortlist = pd.concat([y_train, x_train, dfpoly, dfinter], axis=1)
x_shortlist = df_shortlist.drop('Life_expectancy', axis=1)
x_shortlist.columns
# New baseline with shortlisted columns plus interaction and polynomial terms.
x_train = x_shortlist
regression = LinearRegression()
crossvalidation = KFold(n_splits=5, shuffle=True, random_state=seed)
model2_rsq = np.mean(cross_val_score(regression, x_train, y_train, scoring='r2', cv=crossvalidation))
# FIX: score MSE on the same shuffled folds as r-sq. The original passed
# cv=5, an unshuffled split, so the two printed metrics used different folds.
model2_mse = -np.mean(cross_val_score(regression, x_train, y_train, cv=crossvalidation, scoring='neg_mean_squared_error'))
print(f'No of predictors: {x_train.shape[1]}')
print(f'kfold cv r-sq: {round(model2_rsq,4)}')
print(f'kfold cv MSE : {round(model2_mse,4)}')
# Correlation of every shortlisted feature (including the polynomial and
# interaction terms) with Life Expectancy.
cols = list(df_shortlist.iloc[:, 1:].columns) # skip Life expectancy
corr_list = []
top_predictors = []
for col in cols:
    corr = stats.pearsonr(df_shortlist[col], df_shortlist.Life_expectancy)
    corr_list.append([abs(round(corr[0],3)), col])
    top_predictors.append(col)
# NOTE(review): the original also computed a log-transformed correlation here
# via sklearn's Normalizer, but its result (corrlog) was never read anywhere,
# so that dead computation has been removed.
corr_list = sorted(corr_list, key=lambda i: i[0], reverse=True)
print(f'Sorted correlation between all predictors vs. Life expectancy:')
display(corr_list[:])
# Fit a plain linear regression to inspect coefficient magnitudes and signs.
linreg = LinearRegression()
linreg.fit(x_train, y_train)
# y_train is a one-column DataFrame, so coef_ has shape (1, n_features).
coeff = linreg.coef_
coeff_list = []
for n, col in enumerate(cols):
    value = coeff[0][n]
    direction = 'Negative' if value < 0 else 'Positive'
    coeff_list.append([abs(round(value, 3)), direction, col])
# Largest-magnitude coefficients first.
coeff_list.sort(key=lambda i: i[0], reverse=True)
coeff_list
# Residual diagnostics for the shortlisted model, fitted via statsmodels.
dfplot = pd.concat([y_train, x_train], axis=1)
outcome = 'Life_expectancy'
x_cols = list(x_train.columns)
formula = outcome + '~' + '+'.join(x_cols)
model2 = ols(formula=formula, data=dfplot).fit()
summary = model2.summary()

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,5))
# Left panel: fitted values vs. residuals. Residuals should scatter randomly
# around zero with no discernible pattern (homoscedasticity).
axes[0].scatter(model2.predict(x_train), model2.resid, color='blue', alpha=0.3)
axes[0].axhline(y=0, color='black')
axes[0].set_xlim(72,82)
axes[0].set_ylim(-5,5)
axes[0].set_xlabel('Fitted value')
axes[0].set_title('Homoscedasticity of residuals check: \n Scatter plot', fontsize=14)
# Right panel: Q-Q plot. Residuals should hug the 45-degree reference line
# if they are normally distributed.
fig = sm.graphics.qqplot(model2.resid, dist=stats.norm, line='45', fit=True, ax=axes[1])
axes[1].axhline(y=0, color='black')
axes[1].set_xlim(-4,4)
axes[1].set_ylim(-4,4)
axes[1].set_title('Normality of residuals check: \n Q-Q plot', fontsize=14)
plt.show()
# Conclusions: residuals look random (hence homoscedastic) and largely normal
# on the Q-Q plot, though the tails deviate somewhat even after the outlier
# removal performed during data cleansing.
# Assign training data to the final choice of predictors
x_train = x_shortlist
# The test set still holds only the original columns; rebuild it so that its
# columns (including interaction and polynomial terms) match x_train exactly.
x_test_raw = x_test
final_columns = list(x_train.columns)
df_test = pd.DataFrame()
for col in final_columns:
    if '_AND_' in col:
        # Interaction term: product of the two base columns.
        left, _, right = col.partition('_AND_')
        df_test[col] = x_test_raw[left] * x_test_raw[right]
    elif '_power' in col:
        # Polynomial term: base column raised to the given power.
        # BUG FIX: the original assigned the raw column without
        # exponentiation (x_test_raw[polycol]), so every _power2/_power3
        # test column was wrong and the test-set metrics were invalid.
        base, _, power = col.partition('_power')
        df_test[col] = x_test_raw[base] ** int(power)
    else:
        # Original predictor: copy through unchanged.
        df_test[col] = x_test_raw[col]
# Assign test data (after processing)
x_test = pd.DataFrame(df_test, columns=df_test.columns)
y_test = (pd.DataFrame(y_test)).reset_index(drop=True)
# Double check data dimensions are comparable
print(f'x_train:{x_train.shape}, y_train:{y_train.shape}, x_test:{x_test.shape}, y_test:{y_test.shape}')
# Fit the final model and evaluate it on both training and test data.
linreg = LinearRegression()
linreg.fit(x_train, y_train)
pred_train = linreg.predict(x_train)
pred_test = linreg.predict(x_test)
final_train_rsq = linreg.score(x_train, y_train)
final_test_rsq = linreg.score(x_test, y_test)
final_train_mse = mean_squared_error(y_train, pred_train)
final_test_mse = mean_squared_error(y_test, pred_test)

# Progression of r^2 across the modelling stages.
print('Summary of r^2 improvements')
print('---------------------------')
for label, value in [('1. Baseline :', baseline_rsq),
                     ('2. Baseline with Lasso :', lasso_rsq),
                     ('3. Model 1 using strongest corr :', model1_rsq),
                     ('4. Model 2 incl poly & interacts :', model2_rsq),
                     ('5. Model 3 (final) with Train set:', final_train_rsq),
                     ('6. Model 3 (final) with Test set :', final_test_rsq)]:
    print(label, round(value, 4))
print('')
# Progression of MSE across the same stages.
print('Summary of MSE improvements')
print('---------------------------')
for label, value in [('1. Baseline :', baseline_mse),
                     ('2. Baseline with Lasso :', lasso_mse),
                     ('3. Model 1 using strongest corr :', model1_mse),
                     ('4. Model 2 incl poly & interacts :', model2_mse),
                     ('5. Model 3 (final) with Train set:', final_train_mse),
                     ('6. Model 3 (final) with Test set :', final_test_mse)]:
    print(label, round(value, 4))
# Side-by-side actual-vs-predicted plots: train (left, blue), test (right, red).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,5))
axes[0].scatter(y_train, pred_train, label='Model', color='blue', alpha=0.3)
axes[0].plot(y_train, y_train, label='TRAIN data', color='blue')
axes[1].scatter(y_test, pred_test, label='Model', color='red', alpha=0.3)
axes[1].plot(y_test, y_test, label='TEST data', color='red')
# Identical axis ranges so the two panels are directly comparable.
for ax in axes:
    ax.set_xlim(65,90)
    ax.set_ylim(65,90)
    ax.legend(loc='upper left')
axes[0].set_ylabel('Life expectancy')
axes[0].set_title(f'TRAIN data vs. Model \n sample n={x_train.shape[0]}, MSE={round(final_train_mse,2)}, r^2={round(final_train_rsq,2)}', fontsize=15)
axes[1].set_title(f'TEST data vs. Model \n sample n={x_test.shape[0]}, MSE={round(final_test_mse,2)}, r^2={round(final_test_rsq,2)}', fontsize=15)
plt.legend()
plt.show()
# From the study and models conducted on the impact of various health and lifestyle factors on life expectancy in the US, we arrived at the following key conclusions: